%pip install pandas numpy matplotlib seaborn plotly nbformat deepl dotenv wordcloud

Requirement already satisfied: pandas in ./venv/lib/python3.12/site-packages (2.3.1)
Requirement already satisfied: numpy in ./venv/lib/python3.12/site-packages (2.3.1)
Requirement already satisfied: matplotlib in ./venv/lib/python3.12/site-packages (3.10.3)
Requirement already satisfied: seaborn in ./venv/lib/python3.12/site-packages (0.13.2)
Requirement already satisfied: plotly in ./venv/lib/python3.12/site-packages (6.2.0)
Requirement already satisfied: nbformat in ./venv/lib/python3.12/site-packages (5.10.4)
Requirement already satisfied: deepl in ./venv/lib/python3.12/site-packages (1.22.0)
Requirement already satisfied: dotenv in ./venv/lib/python3.12/site-packages (0.9.9)
Requirement already satisfied: wordcloud in ./venv/lib/python3.12/site-packages (1.9.4)
Requirement already satisfied: python-dateutil>=2.8.2 in ./venv/lib/python3.12/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.12/site-packages (from pandas) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in ./venv/lib/python3.12/site-packages (from pandas) (2025.2)
Requirement already satisfied: contourpy>=1.0.1 in ./venv/lib/python3.12/site-packages (from matplotlib) (1.3.2)
Requirement already satisfied: cycler>=0.10 in ./venv/lib/python3.12/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in ./venv/lib/python3.12/site-packages (from matplotlib) (4.58.5)
Requirement already satisfied: kiwisolver>=1.3.1 in ./venv/lib/python3.12/site-packages (from matplotlib) (1.4.8)
Requirement already satisfied: packaging>=20.0 in ./venv/lib/python3.12/site-packages (from matplotlib) (25.0)
Requirement already satisfied: pillow>=8 in ./venv/lib/python3.12/site-packages (from matplotlib) (11.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in ./venv/lib/python3.12/site-packages (from matplotlib) (3.2.3)
Requirement already satisfied: narwhals>=1.15.1 in ./venv/lib/python3.12/site-packages (from plotly) (1.46.0)
Requirement already satisfied: fastjsonschema>=2.15 in ./venv/lib/python3.12/site-packages (from nbformat) (2.21.1)
Requirement already satisfied: jsonschema>=2.6 in ./venv/lib/python3.12/site-packages (from nbformat) (4.24.0)
Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in ./venv/lib/python3.12/site-packages (from nbformat) (5.8.1)
Requirement already satisfied: traitlets>=5.1 in ./venv/lib/python3.12/site-packages (from nbformat) (5.14.3)
Requirement already satisfied: requests<3,>=2 in ./venv/lib/python3.12/site-packages (from deepl) (2.32.4)
Requirement already satisfied: python-dotenv in ./venv/lib/python3.12/site-packages (from dotenv) (1.1.1)
Requirement already satisfied: attrs>=22.2.0 in ./venv/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (25.3.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in ./venv/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (2025.4.1)
Requirement already satisfied: referencing>=0.28.4 in ./venv/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (0.36.2)
Requirement already satisfied: rpds-py>=0.7.1 in ./venv/lib/python3.12/site-packages (from jsonschema>=2.6->nbformat) (0.26.0)
Requirement already satisfied: platformdirs>=2.5 in ./venv/lib/python3.12/site-packages (from jupyter-core!=5.0.*,>=4.12->nbformat) (4.3.8)
Requirement already satisfied: six>=1.5 in ./venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
Requirement already satisfied: charset_normalizer<4,>=2 in ./venv/lib/python3.12/site-packages (from requests<3,>=2->deepl) (3.4.2)
Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.12/site-packages (from requests<3,>=2->deepl) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.12/site-packages (from requests<3,>=2->deepl) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.12/site-packages (from requests<3,>=2->deepl) (2025.7.9)
Requirement already satisfied: typing-extensions>=4.4.0 in ./venv/lib/python3.12/site-packages (from referencing>=0.28.4->jsonschema>=2.6->nbformat) (4.14.1)
Note: you may need to restart the kernel to use updated packages.

import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import deepl

load_dotenv()

True

household1_df = pd.read_csv('./datasets/2.1_Дунаєць.xlsx - База листов.csv')
people1_df = pd.read_csv('./datasets/2.1_Дунаєць.xlsx - База людей.csv')

people2_df = pd.read_csv('./datasets/2.2_Дунаєць_1778.xlsx - База.csv', header=3)

family_total_structure_1_df = pd.read_csv('./datasets/2.1.1_Структура родини Дунаєць_1897.xlsx - Total.csv')
family_total_structure_2_df = pd.read_csv('./datasets/2.2.2_Структура родини Дунаєць_1778.xlsx - Total.csv')

household1_df = household1_df.loc[:, ['ID Домохозяйствао', 'Село/деревня', 'Хозяин', 'Сколько жилых строений', 'Всего наличного мужеского населения', 'Всего наличнаго женского населения', 'Постоянно живущаго М', 'Постоянно живущаго Ж', 'Приписанного здесь М', 'Приписанного здесь Ж']]

people1_df = people1_df.loc[:, ['ID строки в базе', 'ID Домохозяйствао', 'ID жилец', 'ФИО', 'Пол', 'Глава хозяйства и глава семьи', 'Возраст', 'Семейный статус', 'Сословие, состояние или звание', 'Здесь ли родился', 'Здесь ли обыкновенно проживает', 'Умеет ли читать', 'Обучение', 'Профессия главное', 'Профессия вспомогательное', 'Положение по воинской повинности']]

people2_df = people2_df.loc[:, ['ID наскрізна', 'ID домогосподарство', 'Чоловіки', 'Жінки', 'Імʼя', 'По-батькові', 'Прізвище', 'Родиний статус', 'Категорія', 'Клас', 'Соціальний статус', 'Вік']]

household1_df.rename(columns={
    'ID Домохозяйствао': 'id',
    'Село/деревня': 'village/hamlet',
    'Хозяин': 'host_name',
    'Сколько жилых строений': 'residential_buildings_count',
    'Всего наличного мужеского населения': 'total_male_count',
    'Всего наличнаго женского населения': 'total_female_count',
    'Постоянно живущаго М': 'permanent_male_count',
    'Постоянно живущаго Ж': 'permanent_female_count',
    'Приписанного здесь М': 'attributed_male_count',
    'Приписанного здесь Ж': 'attributed_female_count'
}, inplace=True)

people1_df.rename(columns={
    'ID строки в базе': 'id',
    'ID Домохозяйствао': 'household_id',
    'ID жилец': 'resident_id',
    'ФИО': 'name',
    'Пол': 'sex',
    'Глава хозяйства и глава семьи': 'family_social_status',
    'Возраст': 'age',
    'Семейный статус': 'marital status',
    'Сословие, состояние или звание': 'class',
    'Здесь ли родился': 'is_indigenous',
    'Здесь ли обыкновенно проживает': 'is_resident',
    'Умеет ли читать': 'can_read',
    'Обучение': 'education',
    'Профессия главное': 'main_profession',
    'Профессия вспомогательное': 'additional_profession',
    'Положение по воинской повинности': 'regulations_on_conscription'
}, inplace=True)

people2_df.rename(columns={
    'ID наскрізна': 'id',
    'ID домогосподарство': 'household_id',
    'Чоловіки': 'men_count',
    'Жінки': 'women_count',
    'Імʼя': 'fist_name',
    'По-батькові': 'patronymic',
    'Прізвище': 'surname',
    'Родиний статус': 'family_social_status',
    'Категорія': 'category',
    'Клас': 'class',
    'Соціальний статус': 'social_stauts',
    'Вік' : 'age'
}, inplace=True)

# Delete last rows with full empty data
people1_df = people1_df[people1_df.name.notna()]

people1_df.shape

(1545, 16)

people1_df.loc[:, 'name'] = people1_df.loc[:, 'name'].apply(str.strip)
people1_df.loc[:, 'surname'] = people1_df.loc[:, 'name'].str.extract(r'(\w+)\s', expand=False)
people1_df.loc[:, 'first_name'] = people1_df.loc[:, 'name'].str.extract(r'\s(\w+)\s', expand=False)

people1_df.patronymic = ''
people1_df.loc[people1_df.name.apply(lambda x: len(x.split(' '))) == 3, 'patronymic'] = people1_df \
    .loc[people1_df.name.apply(lambda x: len(x.split(' '))) == 3, 'name'] \
    .str.extract(r'\s\w+\s(\w+)', expand=False)

people1_df.drop(labels=['name'], axis=1, inplace=True)

people1_df.loc[~people1_df.age.str.isnumeric(), 'age'] = '0'

people1_df.age = people1_df.age.astype(np.int32)

people2_df.loc[:, ['household_id', 'category', 'class', 'surname']] = people2_df \
    .loc[:, ['household_id', 'category', 'class', 'surname']] \
    .ffill()

people2_df.household_id = people2_df.household_id.astype(np.int32)

people2_df['sex'] = ''
people2_df.loc[people2_df.men_count.notna(), 'sex'] = 'm'
people2_df.loc[people2_df.women_count.notna(), 'sex'] = 'f'

people2_df.drop(axis=1, labels=['men_count', 'women_count'], inplace=True)

people2_df.age = people2_df.age.astype(np.int32)

family_total_structure_1_df.rename(columns={
    'Категорія': 'category',
    'Клас': 'class',
    'Кількість': 'class_count',
    '%': 'category_percentage',
    '% кожного класу': 'class_percentage',
    'Усього по категоріях': 'category_total_count'
}, inplace=True)

family_total_structure_2_df.rename(columns={
    'Категорія': 'category',
    'Клас': 'class',
    'Кількість': 'class_count',
    '%': 'category_percentage',
    '% кожного класу': 'class_percentage',
    'Усього по категоріях': 'category_total_count'
}, inplace=True)

family_total_structure_1_df.loc[:, ['category', 'category_percentage']] = family_total_structure_1_df.loc[:, ['category', 'category_percentage']].ffill()

family_total_structure_2_df.loc[:, ['category', 'category_percentage']] = family_total_structure_2_df.loc[:, ['category', 'category_percentage']].ffill()

family_total_structure_1_df.loc[:, 'category_total_count'] = family_total_structure_1_df.loc[:, 'category_total_count'].bfill()

family_total_structure_2_df.loc[:, 'category_total_count'] = family_total_structure_2_df.loc[:, 'category_total_count'].bfill()

# Delete Total Column
family_total_structure_1_df = family_total_structure_1_df[family_total_structure_1_df.category != 'Усього']
family_total_structure_2_df = family_total_structure_2_df[family_total_structure_2_df.category != 'Усього']

(people1_df.age.max(), people2_df.age.max())

(np.int32(90), np.int32(84))

bins = [i for i in range(0, 100, 10)]
labels = [f'{i}-{i+10}' for i in range(0, 90, 10)]

people1_df.loc[:, 'age_group'] = ''
people2_df.loc[:, 'age_group'] = ''

people1_df.loc[:, 'age_group'] = pd.cut(people1_df.age, bins=bins, labels=labels)
people2_df.loc[:, 'age_group'] = pd.cut(people2_df.age, bins=bins, labels=labels)

men_ages_1 = people1_df.loc[people1_df.sex == 'm', :][['age_group']] \
    .groupby('age_group') \
    .agg(count=('age_group', 'size')) \
    .reset_index()

women_ages_1 = people1_df.loc[people1_df.sex == 'f', :][['age_group']] \
    .groupby('age_group') \
    .agg(count=('age_group', 'size')) \
    .reset_index()

men_ages_2 = people2_df.loc[people2_df.sex == 'm', :][['age_group']] \
    .groupby('age_group') \
    .agg(count=('age_group', 'size')) \
    .reset_index()

women_ages_2 = people2_df.loc[people2_df.sex == 'f', :][['age_group']] \
    .groupby('age_group') \
    .agg(count=('age_group', 'size')) \
    .reset_index()

men_ages_1['count']

0    166
1    158
2    130
3    101
4     78
5     55
6     34
7     16
8      3
Name: count, dtype: int64

fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'bar'}, {'type': 'bar'}]],
    subplot_titles=(
        '1778 year',
        '1897 year',
    ),
    shared_yaxes=True
)

fig.add_trace(
    go.Bar(
        y=bins,
        x=pd.Series(men_ages_2['count']),
        orientation='h',
        name='Men 1778',
        hoverinfo='name+x',
        marker=dict(color='#005f73')
    ), row=1, col=1
),

fig.add_trace(
    go.Bar(
        y=bins,
        x=pd.Series(women_ages_2['count']) * -1,
        orientation='h',
        name='Women 1778',
        textposition=None,
        customdata=pd.Series(women_ages_2['count']),
        hovertemplate='%{customdata}',
        marker=dict(color='#ae2012')
    ), row=1, col=1
)

fig.add_trace(
    go.Bar(
        y=bins,
        x=pd.Series(men_ages_1['count']),
        orientation='h',
        name='Men 1897',
        hoverinfo='name+x',
        marker=dict(color='#94d2bd')
    ), row=1, col=2
)

fig.add_trace(
    go.Bar(
        y=bins,
        x=pd.Series(women_ages_1['count']) * -1,
        orientation='h',
        name='Women 1897',
        hoverinfo='name',
        customdata=pd.Series(women_ages_1['count']),
        hovertemplate='%{customdata}',
        textposition=None,
        marker=dict(color='#ca6702')
    ), row=1, col=2
)

xaxis = dict(
    range=[-250, 250],
    title='Number'
)

fig.update_layout(
    yaxis=go.layout.YAxis(title='Age', range=[-8, 98], tickvals=bins, ticktext=labels),
    xaxis=dict(**xaxis),
    xaxis2=dict(**xaxis),
    title_text="Sex-age pyramid",
    title_font=dict(size=30),
    height=500,
    width=1000,
    barmode='overlay',
    bargap=0.1,
)

fig.show()

people1_ages_df = people1_df.loc[:, ['age', 'sex']]
people1_ages_df.loc[:, 'year'] = 1897
people1_ages_df.loc[people1_ages_df.sex == 'm', 'sex'] = 'male'
people1_ages_df.loc[people1_ages_df.sex == 'f', 'sex'] = 'female'

people2_ages_df = people2_df.loc[:, ['age', 'sex']]
people2_ages_df.loc[:, 'year'] = 1778
people2_ages_df.loc[people2_ages_df.sex == 'm', 'sex'] = 'male'
people2_ages_df.loc[people2_ages_df.sex == 'f', 'sex'] = 'female'

data_concated_ages = pd.concat([people2_ages_df, people1_ages_df])

plt.title('Age boxplot')
sns.boxplot(
    data=data_concated_ages,
    y='age',
    x='year',
    hue='sex',
    legend='brief',
    palette=['#0a9396', '#bb3e03'],
)
sns.despine(offset=10, trim=True)
plt.xlabel('Year')
plt.ylabel('Age')
plt.show()

mean_age1 = people1_df.age.mean()
mean_age2 = people2_df.age.mean()

people_count1 = people1_df.age.count()
people_count2 = people2_df.age.count()

men_count1 = men_ages_1.count()
men_count2 = men_ages_2.count()

mean_men1 = men_ages_1.mean()
mean_men2 = men_ages_2.mean()

mean_women1 = women_ages_1.mean() * -1
mean_women2 = women_ages_2.mean() * -1

print("Some more statistics in 1778 and 1897 years:")
print(f"The average age - {round(mean_age2, 2)} and {round(mean_age1, 2)}")
print(f"The number of people - {people_count2} and {people_count1}")
print(f"The men proportion - {round(men_count2 / people_count2, 2) * 100}% and {round(men_count1 / people_count1, 2) * 100}%")
print(f"The men average age - {round(mean_men2, 2)} and {round(mean_men1, 2)}")
print(f"The women average age - {round(mean_women2, 2)} and {round(mean_women1, 2)}")

Some more statistics in 1778 and 1897 years:
The average age - 23.53 and 25.42
The number of people - 1597 and 1545
The men proportion - 52.0% and 50.0%
The men average age - 23.58 and 26.63
The women average age - 23.47 and 24.2

family_size_counts_1 = people1_df.groupby('household_id')['id'] \
    .count() \
    .reset_index() \
    .rename(columns={'id': 'count'}) \
    .sort_values('count')

family_size_counts_1.loc[:, 'year'] = 1897

family_size_counts_2 = people2_df.groupby('household_id')['id'] \
    .count() \
    .reset_index() \
    .rename(columns={'id': 'count'}) \
    .sort_values('count')

family_size_counts_2.loc[:, 'year'] = 1778

family_size = pd.concat([family_size_counts_1, family_size_counts_2])

plt.title('Log Density plot of family size in 1778/1897 years')
ax = sns.kdeplot(
    data=family_size,
    x='count',
    hue='year',
    log_scale=True,
    fill=True,
    alpha=0.7,
    palette=['#0a9396', '#bb3e03']
)
plt.xlabel('Log Count')
plt.show()

print(f"The average family size in 1778 is {round(family_size_counts_2['count'].mean(), 2)}")
print(f"And the average family size in 1897 is {round(family_size_counts_1['count'].mean(), 2)}")

The average family size in 1778 is 15.21
And the average family size in 1897 is 5.94

fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{'type': 'pie'}, {'type': 'pie'}], [{'type': 'bar', 'colspan': 2}, None]],
    subplot_titles=(
        'People category distribution in 1778',
        'People category distribution in 1897',
        'People class distribution in 1778/1897'
    ),
    vertical_spacing=0.15
)

pie_colors = ['#001219', '#0a9396', '#e9d8a6', '#ca6702', '#ae2012']

fig.add_trace(
    go.Pie(
        labels=family_total_structure_1_df.category.unique(),
        values=family_total_structure_1_df.groupby('category')['category_total_count'] \
            .first() \
            .reset_index() \
            ['category_total_count'],
        textinfo='value',
        hoverinfo='label+percent',
        marker=dict(colors=pie_colors, line=dict(color='#000000', width=2)),
        textfont_size=20,
        hole=0.3
    ),
    row=1, col=2
)

fig.add_trace(
    go.Pie(
        labels=family_total_structure_2_df.category.unique(),
        values=family_total_structure_2_df.groupby('category')['category_total_count'] \
            .first() \
            .reset_index() \
            ['category_total_count'],
        textinfo='value',
        hoverinfo='label+percent',
        marker=dict(colors=pie_colors, line=dict(color='#000000', width=2)),
        textfont_size=20,
        hole=0.3
    ),
    row=1, col=1
)

fig.add_trace(
    go.Bar(
        name='1778',
        x=family_total_structure_2_df.loc[:, 'class'],
        y=family_total_structure_2_df.loc[:, 'class_percentage'] / 100,
        marker_color='#9b2226'
    ),
    row=2, col='all'
)

fig.add_trace(
    go.Bar(
        name='1897',
        x=family_total_structure_1_df.loc[:, 'class'],
        y=family_total_structure_1_df.loc[:, 'class_percentage'] / 100,
        marker_color='#005f73'
    ),
    row=2, col='all'
)

fig.update_yaxes(
    tickformat=".0%",
)

fig.update_traces(textposition='inside')
fig.update_layout(
    barmode='group',
    height=900,
    width=1000,
    uniformtext_minsize=12,
    uniformtext_mode='hide',
)

fig.show()

auth_key = os.getenv('AUTH_KEY')

deepl_client = deepl.DeepLClient(auth_key)

surnames_1 = people1_df[people1_df.sex == 'm'] \
    .groupby(['household_id'])['surname'] \
    .first() \
    .reset_index()

surnames_1.loc[:, 'surname_translated'] = ''
surnames_1.loc[:, 'surname_translated'] = surnames_1.surname.apply(lambda x: deepl_client.translate_text(x, target_lang="UK").text)

surnames_2 = people2_df.groupby(['household_id'])['surname'] \
    .first() \
    .reset_index() \
    ['surname']

(len(surnames_1), len(surnames_2))

(256, 105)

surnames_1 = pd.Series(surnames_1['surname_translated'].str.lower())
surnames_2 = surnames_2.str.lower()

# Not include last symbols at the end of the word
surnames_1 = surnames_1.str.extract(r'(\w+)', expand=False)
surnames_2 = surnames_2.str.extract(r'(\w+)', expand=False)

similar_surnames_text = ''

for surname in list(surnames_2):
    if surname in set(surnames_1):
        similar_surnames_text += surname + ' '

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(similar_surnames_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Census Analyzing in the Dunaec village (1778/1897)¶

Imprort Libraries¶

Read data¶

Preprocess data¶

Visualizing Insights¶

Age Distribution¶

Family Size¶

Family Total Structure Comparison¶

Surname Comparison¶

Conclusions¶